library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(purrr)
library(dplyr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(tools)
prevalence_df = read_csv("data/PLACES__Local_Data_for_Better_Health__County_Data_2023_release.csv") |>
  janitor::clean_names() |>
  select(year, state_abbr, category, measure, data_value, total_population, category_id, measure_id, short_question_text) |>
  filter(year == 2021)
## Rows: 228770 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): StateAbbr, StateDesc, LocationName, DataSource, Category, Measure,...
## dbl  (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl  (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
prevalence_outcome = prevalence_df |>
  filter(category == "Health Outcomes")
  
prevalence_chol = prevalence_outcome |>
  filter(measure_id == "HIGHCHOL") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(chol_pre_2021 = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevalence_risk = prevalence_df |>
  filter(category == "Health Risk Behaviors")
  

risk_lpa = prevalence_risk |>
  filter(measure_id == "LPA") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(lpa_pre = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
risk_binge = prevalence_risk |>
  filter(measure_id == "BINGE") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(binge_pre = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
risk_smoking = prevalence_risk |>
  filter(measure_id == "CSMOKING") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(smoking_pre = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevalence_prevention = prevalence_df |>
  filter(category == "Prevention")

prevention_insurance = prevalence_prevention |>
  filter(measure_id == "ACCESS2") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(insurance_pre = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevention_cholscreen = prevalence_prevention |>
  filter(measure_id == "CHOLSCREEN") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(cholscreen_pre = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevalence_df_2020 = read_csv("data/PLACES__Local_Data_for_Better_Health__County_Data_2023_release.csv") |>
  janitor::clean_names() |>
  select(year, state_abbr, category, measure, data_value, total_population, category_id, measure_id, short_question_text) |>
  filter(year == 2020)
## Rows: 228770 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): StateAbbr, StateDesc, LocationName, DataSource, Category, Measure,...
## dbl  (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl  (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
risk_sleep_2020 = prevalence_df_2020 |>
  filter(category == "Health Risk Behaviors") |>
  filter(measure_id == "SLEEP") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(sleep_pre = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
#anti_join(risk_sleep,risk_binge, by = 'state_abbr')
#FL in sleep but not in chol & other dfs

prevalence_chol_2020 = prevalence_df |>
  filter(category == "Health Outcomes") |>
  filter(measure_id == "HIGHCHOL") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(chol_pre_2020 = disease_popu / total_popu) |>
  select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
merged_2021 = Reduce(function(df1, df2) left_join(df1, df2, by = c("state_abbr", "total_popu")), list(prevalence_chol, risk_binge, risk_lpa, risk_smoking, prevention_insurance, prevention_cholscreen))

merged_2020 = left_join(prevalence_chol_2020, risk_sleep_2020)
## Joining with `by = join_by(state_abbr, total_popu)`
merged_total = left_join(merged_2021, merged_2020)
## Joining with `by = join_by(state_abbr, total_popu)`

Here is the prevalence plot!

x_range = range(c(pull(merged_total, chol_pre_2021), pull(merged_total, chol_pre_2020)))

y_range = range(c(pull(merged_total, binge_pre), pull(merged_total, lpa_pre), pull(merged_total, smoking_pre), pull(merged_total, insurance_pre), pull(merged_total, cholscreen_pre), pull(merged_total, sleep_pre)))

prevalence_plot = 
merged_total |>
  plot_ly() |>
  add_trace(x = ~chol_pre_2021, y = ~binge_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Binge Drinking', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~lpa_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'No Leisure-time Physical Activity', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~smoking_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Smoking Currently', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~insurance_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Lack of Health Insurance Currently', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~cholscreen_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Cholesterol Screening', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~sleep_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Sleep Less Than 7 Hours', showlegend = TRUE, visible = TRUE) |>
  
  layout(title = 'High Cholesterol v.s. Health-Affecting Behaviors',
         xaxis = list(title = 'Prevalence of High Cholesterol', range = x_range + c(-0.01, 0.01), tickformat = '.0%', showline = TRUE),
         yaxis = list(title = 'Prevalence of Health-Affecting Behavior', range = y_range + c(-0.05, 0.05), tickformat = '.0%', showline = TRUE))
         
prevalence_plot

Find top health behaviors that has largest correlations with high cholesterol.

cor_table = data.frame(behavior = character(), correlation = numeric(), stringsAsFactors = FALSE)

behavs = colnames(merged_2021[4:8])

for (each in behavs) {
  new_row = data.frame(behavior = substr(each, 1, nchar(each) - 4), correlation = cor(pull(merged_2021, chol_pre_2021), pull(merged_2021, each)))
  cor_table = rbind(cor_table, new_row)
}

sleep_row = data.frame(behavior = 'sleep', correlation = cor(pull(merged_2020, chol_pre_2020), pull(merged_2020, sleep_pre)))

cor_table = rbind(cor_table, sleep_row)
cor_table = arrange(cor_table, desc(correlation))

cor_table = cor_table |>
  mutate(behavior = case_match(behavior,
                               "lpa" ~ "No Leisure-time Physical Activity",
                               "sleep" ~ "Sleep Less Than 7 Hours",
                               "smoking" ~ "Smoking Currently",
                               "insurance" ~ "Lack of Health Insurance Currently",
                               "cholscreen" ~ "Cholesterol Screening",
                               "binge" ~ "Binge Drinking"
                               ),
         behavior = factor(behavior, levels = behavior)) |>
  rename("Behavior" = behavior,
         "Correlation" = correlation) 


knitr::kable(cor_table)
Behavior Correlation
No Leisure-time Physical Activity 0.6960863
Sleep Less Than 7 Hours 0.6695707
Smoking Currently 0.4931834
Lack of Health Insurance Currently 0.3606138
Cholesterol Screening 0.2124629
Binge Drinking -0.5014168

Use a bar plot to visulize it.

cor_bar = plot_ly(cor_table, x = ~Behavior, y = ~Correlation, type = 'bar', opacity = 0.8, marker = list(color = "MidnightBlue")) |>
  layout(title = 'Correlation Between High Cholesterol and Health Behaviors',
         xaxis = list(title = 'Behavior', tickangle=-30),
         yaxis = list(title = 'Correlation'))
  

cor_bar